R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Attach the packages the following chunks rely on. dplyr supplies `%>%`,
# filter() and tally(), all of which are called well before the later
# library(dplyr) line, so it has to be attached here as well.
library(readr)
library(dplyr)

# Read the diabetes prediction data from the CSV file (path is relative to the
# working directory).
diabetes_dataset <- read_csv("diabetes_prediction_dataset.csv")
## Rows: 100000 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): gender, smoking_history
## dbl (7): age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_l...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data viewer only in an interactive session; there is nothing to show
# while the document is being knitted. Base `View()` is used because the
# lower-case `view()` is not provided by readr or dplyr.
if (interactive()) {
  View(diabetes_dataset)
}

Including Plots

You can also embed plots, for example:

# Show the first rows of the imported data.
print(diabetes_dataset)
## # A tibble: 100,000 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Split the data by gender. Rows whose gender is neither "Male" nor "Female"
# end up in neither subset.

# male dataset
male_data <- diabetes_dataset %>%
  filter(gender == "Male")

# female dataset
female_data <- diabetes_dataset %>%
  filter(gender == "Female")

female_data
## # A tibble: 58,552 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Female    36            0             0 current          23.4         5  
##  4 Female    20            0             0 never            27.3         6.6
##  5 Female    44            0             0 never            19.3         6.5
##  6 Female    79            0             0 No Info          23.9         5.7
##  7 Female    32            0             0 never            27.3         5  
##  8 Female    53            0             0 never            27.3         6.1
##  9 Female    54            0             0 former           54.7         6  
## 10 Female    78            0             0 former           36.0         5  
## # ℹ 58,542 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Males and females in the original dataset that have a "normal" HbA1c.
# Normal means strictly below 5.7%: a value of exactly 5.7% is the lower bound
# of the prediabetes range, which is how the HbA1c_category cut-offs used for
# the stacked bar chart later in this document treat it.
# NOTE(review): the counts previously printed here (27,397 females and
# 18,865 males) were computed with `<= 5.7` and therefore included people at
# exactly 5.7%. Re-knit the document to refresh the printed output.

female_data %>% filter(HbA1c_level < 5.7) %>% tally()
male_data %>% filter(HbA1c_level < 5.7) %>% tally()
# People (male and female) who have both heart disease and diabetes.
# The rows themselves are printed, not a tally; the number of matching people
# is the row count shown in the tibble header.
diabetes_dataset %>%
  filter(diabetes == 1 & heart_disease == 1)
## # A tibble: 1,267 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      67            0             1 not current      27.3         6.5
##  2 Male      57            1             1 not current      27.8         6.6
##  3 Male      80            0             1 former           24.4         7.5
##  4 Male      75            0             1 not current      28.1         7.5
##  5 Male      69            0             1 former           24.1         6.8
##  6 Female    59            0             1 never            60.3         8.8
##  7 Male      80            0             1 former           33.0         6  
##  8 Female    62            1             1 former           44.2         8.2
##  9 Female    62            1             1 never            43.2         8.8
## 10 Female    76            0             1 former           25.7         9  
## # ℹ 1,257 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Count of people with a BMI of 30 or more who have heart disease.
# The BMI condition must be part of filter(): group_by(bmi >= 30) only labels
# rows and does not remove any, so grouping and then filtering on
# heart_disease returned every heart-disease row (3,942 of them, including
# BMIs such as 25.2 and 20.1) instead of the high-BMI subset.
# NOTE(review): the statement was also run twice with identical output; it is
# kept once here. Re-knit the document to print the corrected count.
diabetes_dataset %>%
  filter(bmi >= 30, heart_disease == 1) %>%
  tally()
# "obese men" with bmi higher than 30 and that have diabetes (tally on second line)
# Rows for men with a BMI of 30 or more and diabetes == 1.
male_data %>%
  filter(bmi >= 30 & diabetes == 1)
## # A tibble: 1,903 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      50            0             0 former           37.2         9  
##  2 Male      53            0             0 current          30.8         6.6
##  3 Male      76            0             0 never            31.9         7.5
##  4 Male      63            1             0 ever             35.1         5.8
##  5 Male      48            1             0 current          36.1         6.8
##  6 Male      37            0             0 never            37.2         7  
##  7 Male      36            0             0 not current      46.1         6.2
##  8 Male      50            0             0 never            31.8         7.5
##  9 Male      43            0             0 never            69.4         7.5
## 10 Male      43            1             0 not current      40.9         6.6
## # ℹ 1,893 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Number of men with a BMI of 30 or more and diabetes == 1.
male_data %>%
  filter(bmi >= 30 & diabetes == 1) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1903
# "obese women" with bmi higher than 30 and that have diabetes (tally on second line)
# Rows for women with a BMI of 30 or more and diabetes == 1.
female_data %>%
  filter(bmi >= 30 & diabetes == 1)
## # A tibble: 2,330 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    67            0             0 never            63.5         8.8
##  2 Female    36            0             0 current          32.3         6.2
##  3 Female    77            0             0 never            31.7         6.5
##  4 Female    47            0             0 never            36.5         7.5
##  5 Female    61            0             0 not current      39.4         9  
##  6 Female    80            0             0 former           36.2         6.5
##  7 Female    52            1             0 never            50.3         6.6
##  8 Female    68            0             0 No Info          40.3         7.5
##  9 Female    70            0             0 not current      33.2         7.5
## 10 Female    67            0             0 former           32.3         7  
## # ℹ 2,320 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Number of women with a BMI of 30 or more and diabetes == 1.
female_data %>%
  filter(bmi >= 30 & diabetes == 1) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  2330
# "underweight men"  with bmi lower than 19 and that have diabetes (tally on second line)
# Rows for men with a BMI of 19 or less (the bound is inclusive) and
# diabetes == 1.
male_data %>%
  filter(bmi <= 19 & diabetes == 1)
## # A tibble: 21 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      42            0             0 current          11.9         6  
##  2 Male       6            0             0 never            15.7         6.1
##  3 Male      71            1             0 former           13.2         6.6
##  4 Male      14            0             0 never            19.0         6.6
##  5 Male      54            0             0 never            18.9         6  
##  6 Male      61            1             0 never            18.4         6.5
##  7 Male       4            0             0 never            18.7         6  
##  8 Male      51            0             0 current          17.8         6.2
##  9 Male      80            1             0 current          19.0         6.6
## 10 Male       6            0             0 No Info          15.6         9  
## # ℹ 11 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Number of men with a BMI of 19 or less and diabetes == 1.
male_data %>%
  filter(bmi <= 19 & diabetes == 1) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1    21
# "underweight women"  with bmi lower than 19 and that have diabetes (tally on second line)
# Rows for women with a BMI of 19 or less (the bound is inclusive) and
# diabetes == 1.
female_data %>%
  filter(bmi <= 19 & diabetes == 1)
## # A tibble: 57 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    79            0             0 not current      18.1         7  
##  2 Female     4            0             0 No Info          15.0         6.5
##  3 Female    51            0             0 current          17.4         7  
##  4 Female     9            0             0 never            16           6.1
##  5 Female    60            0             0 No Info          17.9         8.2
##  6 Female    13            0             0 No Info          17.3         6.2
##  7 Female    80            0             0 never            17.4         6.5
##  8 Female     8            0             0 No Info          14.3         7.5
##  9 Female    80            0             0 never            17.8         6.2
## 10 Female    78            1             0 not current      17.7         8.8
## # ℹ 47 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Number of women with a BMI of 19 or less and diabetes == 1.
female_data %>%
  filter(bmi <= 19 & diabetes == 1) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1    57
# The working assumption is that people with a high BMI are more likely to
# have diabetes. As a counter-check, these are the MEN with a BMI of 30 or
# more who do NOT have diabetes (rows here, tally in the next statement).
male_data %>%
  filter(bmi >= 30 & diabetes == 0)
## # A tibble: 7,445 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      42            0             0 never            33.6         4.8
##  2 Male      15            0             0 never            30.4         6.1
##  3 Male      40            0             0 current          36.4         6  
##  4 Male      30            0             0 never            33.8         6.1
##  5 Male      34            0             0 never            31.2         5.8
##  6 Male      54            0             0 never            31.9         6.6
##  7 Male      79            0             0 former           31.2         5.8
##  8 Male      54            0             0 former           32.8         5  
##  9 Male      38            0             0 never            55.6         6.5
## 10 Male      58            0             0 former           36.5         5.8
## # ℹ 7,435 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Number of men with a BMI of 30 or more and diabetes == 0.
male_data %>%
  filter(bmi >= 30 & diabetes == 0) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  7445
# The working assumption is that people with a high BMI are more likely to
# have diabetes. As a counter-check, these are the WOMEN with a BMI of 30 or
# more who do NOT have diabetes (rows here, tally in the next statement).
female_data %>%
  filter(bmi >= 30 & diabetes == 0)
## # A tibble: 11,852 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    54            0             0 former           54.7         6  
##  2 Female    78            0             0 former           36.0         5  
##  3 Female    53            0             0 No Info          31.8         4  
##  4 Female    34            0             0 never            56.4         6.2
##  5 Female    77            1             1 never            32.0         5  
##  6 Female    27            0             0 not current      30.2         5.7
##  7 Female    37            0             0 No Info          30.5         5.7
##  8 Female    56            0             0 never            31.0         6.5
##  9 Female    44            0             0 never            37.4         5.7
## 10 Female    30            0             0 No Info          50.1         6  
## # ℹ 11,842 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Number of women with a BMI of 30 or more and diabetes == 0.
female_data %>%
  filter(bmi >= 30 & diabetes == 0) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 11852

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# In this section we display a stacked bar chart showing how HbA1c levels (average blood sugar levels) classify males and females into three categories: Normal, Prediabetes, and Diabetes. The plot shows how the distribution of these categories differs between the two genders.

# first we load our libraries such as dplyr for data manipulation, ggplot2 for data visualization, and plotly to make our plot interactive.
library(dplyr)
library(ggplot2)
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Print the original dataset again for reference.
print(diabetes_dataset)
## # A tibble: 100,000 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Build HbA1c_by_gender from the original data: drop the rows whose gender is
# "Other" so that only Male and Female remain, then use mutate() to add an
# HbA1c_category column that labels each HbA1c_level as Normal, Prediabetes or
# Diabetes.
HbA1c_by_gender <- diabetes_dataset %>%
  filter(gender != "Other") %>%
  mutate(
    HbA1c_category = case_when(
      # case_when() uses the first branch that matches, so a value reaching the
      # second branch is already known to be >= 5.7.
      HbA1c_level < 5.7 ~ "Normal < 5.7%",
      HbA1c_level < 6.5 ~ "Prediabetes 5.7% - 6.4%",
      HbA1c_level >= 6.5 ~ "Diabetes ≥ 6.5%",
      # Anything that matches none of the conditions above (for example a
      # missing HbA1c_level) is kept as NA.
      TRUE ~ NA_character_
    )
  )

# Print the new dataset so it can be compared with the original one.
print(HbA1c_by_gender)
## # A tibble: 99,982 × 10
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,972 more rows
## # ℹ 3 more variables: blood_glucose_level <dbl>, diabetes <dbl>,
## #   HbA1c_category <chr>
# Give the stacked bars a fixed category order: Normal, then Prediabetes, then
# Diabetes. Converting HbA1c_category to a factor with explicit `levels` sets
# that order.
HbA1c_by_gender$HbA1c_category <- factor(
  HbA1c_by_gender$HbA1c_category,
  levels = c(
    "Normal < 5.7%",
    "Prediabetes 5.7% - 6.4%",
    "Diabetes ≥ 6.5%"
  )
)

# Build the stacked bar chart with ggplot and store the plot object in
# `interactive_mode` so it can be passed to ggplotly() afterwards.
# scale_fill_manual() assigns a colour to each HbA1c category by name.
interactive_mode <- ggplot(
  data = HbA1c_by_gender,
  mapping = aes(x = gender, fill = HbA1c_category)
) +
  geom_bar(position = "stack") +
  scale_fill_manual(
    values = c(
      "Normal < 5.7%" = "cornsilk2",
      "Prediabetes 5.7% - 6.4%" = "darkkhaki",
      "Diabetes ≥ 6.5%" = "darkgoldenrod"
    )
  ) +
  labs(
    title = "Male vs. Female HbA1c Levels",
    x = "Gender",
    y = "Count",
    fill = "HbA1c Category"
  ) +
  theme_classic() +
  # Centre the plot title horizontally.
  theme(plot.title = element_text(hjust = 0.5))

# ggplotly() turns the ggplot object into an interactive plot. Hovering over a
# stacked bar segment shows its count, gender and HbA1c_category.
ggplotly(p = interactive_mode)
# Extra information about the gender column of the original dataset.
# Females: 58,552, which is 17,122 more than the number of males.
diabetes_dataset %>%
  filter(gender == "Female") %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 58552
# Males: 41,430
diabetes_dataset %>%
  filter(gender == "Male") %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 41430
# Other: 18
diabetes_dataset %>%
  filter(gender == "Other") %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1    18